Tp 1 - Tweets sobre covid-19. Buscando patrones interesantes.
library("ggplot2")
library("readr")
library("dplyr")
library("highcharter")
library("treemap")
library("modeest")
library("GGally")
library("tidyverse")
library("hrbrthemes")
library("tidyr")
library("VIM")
library("e1071")
library("mice")
library("mongolite")
library("SnowballC")
library("tm")
library("twitteR")
library("syuzhet")
library("tidyverse")
library("lubridate")
library("RColorBrewer")
library("infotheo"); # Discretize variable
# Open the DMUBA collection that holds the raw COVID-19 tweets.
tweets <- mongo(db = "DMUBA", collection = "tweets_mongo_covid19")
# Fetch the documents and list the field names that came back.
names(tweets$find())
[1] "user_id" "status_id" "created_at" "screen_name"
[5] "text" "source" "is_quote" "is_retweet"
[9] "favorite_count" "retweet_count" "quote_count" "reply_count"
[13] "hashtags" "symbols" "urls_url" "urls_t_co"
[17] "urls_expanded_url" "media_url" "media_t_co" "media_expanded_url"
[21] "media_type" "ext_media_url" "ext_media_t_co" "ext_media_expanded_url"
[25] "mentions_user_id" "mentions_screen_name" "lang" "quoted_created_at"
[29] "retweet_status_id" "retweet_text" "retweet_created_at" "retweet_source"
[33] "retweet_favorite_count" "retweet_retweet_count" "retweet_user_id" "retweet_screen_name"
[37] "retweet_name" "retweet_followers_count" "retweet_friends_count" "retweet_statuses_count"
[41] "retweet_verified" "geo_coords" "coords_coords" "bbox_coords"
[45] "status_url" "name" "location" "description"
[49] "protected" "followers_count" "friends_count" "listed_count"
[53] "statuses_count" "favourites_count" "account_created_at" "verified"
[57] "profile_banner_url" "profile_background_url" "profile_image_url" "retweet_location"
[61] "retweet_description" "quoted_status_id" "quoted_text" "quoted_source"
[65] "quoted_favorite_count" "quoted_retweet_count" "quoted_user_id" "quoted_screen_name"
[69] "quoted_name" "quoted_followers_count" "quoted_friends_count" "quoted_statuses_count"
[73] "quoted_location" "quoted_description" "quoted_verified" "url"
[77] "place_url" "place_name" "place_full_name" "place_type"
[81] "country" "country_code" "lat" "lng"
[85] "display_text_width" "reply_to_status_id" "reply_to_user_id" "reply_to_screen_name"
# Load the per-tweet type flags from the "tweet_type" collection.
t <- mongo(collection = "tweet_type", db = "DMUBA")
tweets_types <- t$find()
# Header for the per-type counts printed next.
cat("Cantidades de tweets por tipo \n\n")
Cantidades de tweets por tipo
# Total number of tweets fetched.
cat("\t* Tweets: ", dim(tweets_types)[1], "\n")
* Tweets: 28907
# Retweets that are not quotes. Counting the mask directly avoids copying the
# data frame and keeps rows with NA flags from being counted as matches.
cat("\t* Solo RT: ", sum(tweets_types$is_retweet & !tweets_types$is_quote, na.rm = TRUE), "\n")
* Solo RT: 17870
# Quotes that are not retweets. Counting the mask directly avoids copying the
# data frame and keeps rows with NA flags from being counted as matches.
cat("\t* Solo QT: ", sum(!tweets_types$is_retweet & tweets_types$is_quote, na.rm = TRUE), "\n")
* Solo QT: 1789
# Tweets flagged both as retweet and as quote. Counting the mask directly
# avoids copying the data frame and ignores rows with NA flags.
cat("\t* RT y QT: ", sum(tweets_types$is_retweet & tweets_types$is_quote, na.rm = TRUE), "\n")
* RT y QT: 3416
# Tweets that are neither retweets nor quotes. Counting the mask directly
# avoids copying the data frame and ignores rows with NA flags.
cat("\t* TW originales: ", sum(!tweets_types$is_retweet & !tweets_types$is_quote, na.rm = TRUE), "\n")
* TW originales: 5832
# Label every tweet with its type, derived from the retweet/quote flags.
# A vectorised ifelse() replaces the `df[mask, ]$tipo <- value` assignments,
# which stop with "replacement has 1 row, data has 0" when a category is empty.
# The combined label is spelled "QT y RT", matching the label used for
# `numericos$Tipo` further down (it used to read "RQ y RT").
tweets_types$tipo <- ifelse(
  tweets_types$is_retweet,
  ifelse(tweets_types$is_quote, "QT y RT", "Solo RT"),
  ifelse(tweets_types$is_quote, "Solo QT", "Original")
)
# names = c('Solo RT', 'Solo QT', 'RT + QT', 'Original')
# cantidades = c(nrow(tweets_types[tweets_types$is_retweet & !tweets_types$is_quote,]),
# nrow(tweets_types[!tweets_types$is_retweet & tweets_types$is_quote,]),
# nrow(tweets_types[tweets_types$is_retweet & tweets_types$is_quote,]),
# nrow(tweets_types[!tweets_types$is_retweet & !tweets_types$is_quote,])
# )
#
# Frequency table of tweet types as a two-column data frame.
grafico_tipos <- data.frame(table(tweets_types$tipo))
names(grafico_tipos) <- c("Tipos", "Cantidad")
coul <- brewer.pal(5, "Set2")
# Sort counts and labels together. Sorting only the heights (as before) left
# the labels in table order, so bars were drawn under the wrong type name.
with(
  grafico_tipos[order(grafico_tipos$Cantidad, decreasing = TRUE), ],
  barplot(height = Cantidad, names.arg = as.character(Tipos), col = coul)
)
# coul <- brewer.pal(5, "Set2")
# Render the tweets-per-type bar chart (horizontal bars, ordered by count)
# into "tipo_tweet.png".
png(filename = "tipo_tweet.png", width = 1000, bg = "white")
ggplot(grafico_tipos, aes(x = reorder(Tipos, Cantidad), y = Cantidad, fill = Tipos)) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  # All titles and axis labels are intentionally blank.
  labs(title = "", subtitle = "", caption = "", tag = "", x = "", y = "") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.y = element_text(margin = margin(10, 10, 10, 10)),
    axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
    legend.text = element_text(size = 12)
  )
dev.off()
null device
1
# Load the full tweet statistics (note: this rebinds `tweets`).
tweets <- mongo(db = "DMUBA", collection = "tweet_completo_estadisticas")
numericos <- tweets$find()

# Tweet type derived from the retweet/quote flags. Vectorised ifelse() is used
# instead of `df[mask, ]$col <- value`, which fails when a category has no
# rows or when the mask contains NA.
numericos$Tipo <- ifelse(
  numericos$is_retweet,
  ifelse(numericos$is_quote, "QT y RT", "Solo RT"),
  ifelse(numericos$is_quote, "Solo QT", "Original")
)

# Verification flag of the account the content is attributed to:
#   Original           -> the tweeting user (`verified`)
#   Solo QT            -> the quoted user (`quoted_verified`)
#   Solo RT / QT y RT  -> the retweeted user (`retweet_verified`)
numericos$verificado <- ifelse(
  numericos$Tipo == "Original",
  numericos$verified,
  ifelse(
    numericos$Tipo == "Solo QT",
    numericos$quoted_verified,
    numericos$retweet_verified
  )
)

# Plot label for the flag; a missing flag is shown as "No".
numericos$verificado_grafico <- ifelse(
  !is.na(numericos$verificado) & numericos$verificado,
  "Si",
  "No"
)
# Render verified vs. non-verified counts, one panel per tweet type, into
# "tipo_x_tweet_grid2.png".
png(filename = "tipo_x_tweet_grid2.png", width = 1000, bg = "white")
ggplot(numericos, aes(x = verificado_grafico, fill = Tipo)) +
  geom_bar() +
  facet_wrap(~ Tipo, nrow = 2) +
  scale_fill_brewer(palette = "Set2") +
  # All titles and axis labels are intentionally blank.
  labs(title = "", subtitle = "", caption = "", tag = "", x = "", y = "") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.text.y = element_text(margin = margin(10, 10, 10, 10)),
    axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
    legend.text = element_text(size = 10),
    # Square panels.
    aspect.ratio = 1
  )
dev.off()
null device
1
# Bar counts of the `verified` flag, one panel per tweet type.
# NOTE(review): `verified` is read from `tweets_types`; confirm the
# "tweet_type" collection actually provides that field.
ggplot(tweets_types, aes(x = verified, fill = tipo)) +
  facet_wrap(~ tipo, nrow = 2) +
  geom_bar()
Hay más usuarios verificados en el contenido nuevo. A su vez, hay más verificados en el contenido citado. Eso habla de que un usuario verificado crea un contenido de mayor calidad (más difundido y novedoso).
Mientras que el usuario difusor y los retweets, si bien aumentan el alcance de los tweets, no tienen una calidad alta.
# Load the tweet timestamps from the "fechas" collection.
t <- mongo(db = "DMUBA", collection = "fechas")
# The result used to be stored as `tweets_fechas`, while every later statement
# reads `tweets_fecha`; the singular name is now the one that is defined.
tweets_fecha <- t$find()
# Old name kept as an alias in case other code still refers to it.
tweets_fechas <- tweets_fecha
summary(tweets_fecha)
fecha t tc
Min. :2020-04-24 23:52:38 Min. :2020-04-24 23:52:38 2020-05-02 01:44:00: 1049
1st Qu.:2020-05-02 01:46:45 1st Qu.:2020-05-02 01:46:45 2020-05-02 01:40:00: 1043
Median :2020-05-04 20:30:15 Median :2020-05-04 20:30:15 2020-05-02 01:45:00: 1041
Mean :2020-05-06 12:24:30 Mean :2020-05-06 12:24:30 2020-05-02 01:43:00: 1012
3rd Qu.:2020-05-10 12:02:52 3rd Qu.:2020-05-10 12:02:52 2020-05-02 01:41:00: 990
Max. :2020-05-15 18:24:13 Max. :2020-05-15 18:24:13 2020-05-02 01:48:00: 990
(Other) :22782
fecha_str.Length fecha_str.Class fecha_str.Mode
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
[ reached getOption("max.print") -- omitted 28657 rows ]
En un primer intento de graficar, vemos que los datos están distribuidos de una forma particular. La primera pregunta es: ¿hay alguna fecha que presentó una cantidad anómala de datos?
El 2 de mayo lo es. Sin embargo, no fue un día en el que haya acontecido algo en particular: ni es feriado (1/5), ni fue día de anuncios (25/4).
# plot(tweets_fecha$fecha)
# barplot(table(as.Date(tweets_fecha$fecha)))
# Number of tweets per calendar day.
f <- data.frame(table(as.Date(tweets_fecha$fecha)))
# Histogram of the daily totals. The bin count is now explicit and the title
# reports it; the previous title ("Bin size = 3") did not correspond to any
# setting passed to geom_histogram(), which was using its default of 30 bins.
ggplot(data = f, aes(x = Freq)) +
  geom_histogram(bins = 30, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  ggtitle("Bins = 30") +
  theme_ipsum() +
  theme(
    plot.title = element_text(size = 15)
  )
Agrupando en fracciones menores, de 5 minutos, vemos que lo que aconteció fue una ventana de captura de datos desigual. Al reducir la ventana de tiempo, vemos que hay una distribución más uniforme. Igualmente sigue planteándose la pregunta: podríamos analizarlo de a minutos, o con diferencias porcentuales, para ver si realmente hay algo ahí.
# Parse the timestamp and bucket every tweet into 5-minute windows.
tweets_fecha[["t"]] <- ymd_hms(tweets_fecha[["fecha"]])
tweets_fecha[["tc"]] <- cut(tweets_fecha[["t"]], breaks = "5 min")
# Tweets per window, drawn as bars with the window labels as legend.
cant_5_min <- tweets_fecha %>% count(tc)
barplot(height = cant_5_min[["n"]], legend.text = cant_5_min[["tc"]])
## Tweets by date
tweets_fecha[["t"]] <- ymd_hms(tweets_fecha[["fecha"]])
# Per-minute windows look more balanced.
tweets_fecha[["tc"]] <- cut(tweets_fecha[["t"]], breaks = "1 min")
# The result keeps the name `cant_5_min` although the windows are now 1 minute.
cant_5_min <- tweets_fecha %>% count(tc)
barplot(height = cant_5_min[["n"]])
La variable temporal parece ser arbitraria.
Algo a seguir investigando es la ventana temporal entre: * Fecha creada y fecha de creacion del retweet * Fecha creada y fecha de creacion del quoted
library(ggplot2)
library(dplyr)
library("plotly")
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
library(hrbrthemes)
# tweets_fecha$fecha
# Window label as a plain character column. lapply() produced a list column,
# which makes summary() print one row per tweet.
tweets_fecha$fecha_str <- as.character(tweets_fecha$tc)
# Window start as date-time. The labels are parsed below as
# "YYYY-MM-DD HH:MM:SS", so the format has to include the date; with
# "%H:%M:%S" alone nothing parsed.
b <- strptime(as.character(tweets_fecha$tc), format = "%Y-%m-%d %H:%M:%S")

# Split each window label into its calendar day and its time of day.
cant_5_min$fecha <- as.Date(cant_5_min$tc)
cant_5_min$hora <- format(
  strptime(cant_5_min$tc, format = "%Y-%m-%d %H:%M:%S"),
  format = "%H:%M:%S"
)

# Tweets per window, one panel per day.
p <- cant_5_min %>%
  # `hora` is mapped directly: as "HH:MM:SS" text its alphabetical order is
  # already chronological. reorder(hora, hora) tried to average character
  # values and emitted one "argument is not numeric or logical" warning per
  # level.
  ggplot(aes(x = hora, y = n, fill = n)) +
  geom_col() +
  # Centre the diverging palette on the median count. With the default
  # midpoint of 0 the "low" colour is never reached by positive counts.
  scale_fill_gradient2(
    low = "red", mid = "snow3", high = "darkgreen", space = "Lab",
    midpoint = median(cant_5_min$n)
  ) +
  # All titles and axis labels are intentionally blank.
  labs(title = "", subtitle = "", caption = "", tag = "", x = "", y = "") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  facet_wrap(~ fecha, nrow = 4)

# Write the plot to "tipo_x_tweet.png". print() is explicit so the file is
# also produced when the script is run through source().
png(filename = "tipo_x_tweet.png", width = 1000, bg = "white")
print(p)
dev.off()
null device
1
# Convert the ggplot object held in `p` with ggplotly(); `p` is overwritten
# with the result.
p <- ggplotly(p)
argument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or 
logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument 
is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NA
# Display the plot currently held in `p`.
p
# Density of the number of characters per tweet (`cantChars`).
# NOTE(review): `tweets_text` and its `cantChars` column are not created in
# the visible part of the file; confirm where they come from.
# The title typo ("catacteres") is corrected.
ggplot(tweets_text, aes(x = cantChars)) +
  geom_density(fill = "#69b3a2", color = "#e9ecef", alpha = 0.8) +
  ggtitle("Cantidad de caracteres por tweet") +
  theme_ipsum()
# Work on a copy so the original text is kept.
tweets_text.df2 <- tweets_text
# Remove URLs only. The previous pattern "http.*" deleted everything from the
# first URL to the end of the tweet, and the separate "https.*" pass could
# never match anything after it.
tweets_text.df2$text <- gsub("http\\S*", "", tweets_text.df2$text)
# Remove hashtags and user mentions.
tweets_text.df2$text <- gsub("#\\w+", "", tweets_text.df2$text)
tweets_text.df2$text <- gsub("@\\w+", "", tweets_text.df2$text)
# Length of what is left of each tweet.
tweets_text.df2$cantChars <- nchar(tweets_text.df2$text)
# Density of the cleaned lengths (title typo "catacteres" corrected).
ggplot(tweets_text.df2, aes(x = cantChars)) +
  geom_density(fill = "#69b3a2", color = "#e9ecef", alpha = 0.8) +
  ggtitle("Cantidad de caracteres por tweet") +
  theme_ipsum()
# Strip punctuation, then every token that contains a digit (together with
# the whitespace that follows it).
tweets_text.df2[["text"]] <- gsub(
  pattern = "[[:punct:]]", replacement = "", x = tweets_text.df2[["text"]]
)
tweets_text.df2[["text"]] <- gsub(
  pattern = "\\w*[0-9]+\\w*\\s*", replacement = "", x = tweets_text.df2[["text"]]
)
# Recompute the length and plot its density.
tweets_text.df2[["cantChars"]] <- nchar(tweets_text.df2[["text"]])
ggplot(data = tweets_text.df2, mapping = aes(x = cantChars)) +
  geom_density(alpha = 0.8, color = "#e9ecef", fill = "#69b3a2") +
  theme_ipsum() +
  ggtitle("Sin caracteres especiales y numeros")
# Per-user statistics collection.
user_estadisticas <- mongo(db = "DMUBA", collection = "user_estadisticas")
# Fetch the data before summarising it: `info_user` was read here before any
# visible statement had defined it.
info_user <- user_estadisticas$find()
summary(info_user)
user_id screen_name name description followers_count friends_count
Length:25435 Length:25435 Length:25435 Length:25435 Min. : 0 Min. : 0
Class :character Class :character Class :character Class :character 1st Qu.: 92 1st Qu.: 184
Mode :character Mode :character Mode :character Mode :character Median : 313 Median : 441
Mean : 12581 Mean : 1242
3rd Qu.: 999 3rd Qu.: 1073
Max. :18609108 Max. :971277
listed_count statuses_count favourites_count account_created_at verified user_popularity
Min. : 0.00 Min. : 1 Min. : 0 Min. :2007-02-15 14:03:49 Min. :0.00001 Length:25435
1st Qu.: 0.00 1st Qu.: 2209 1st Qu.: 1116 1st Qu.:2011-05-13 00:01:30 1st Qu.:0.00001 Class :character
Median : 1.00 Median : 9386 Median : 5514 Median :2013-12-25 05:01:22 Median :0.00001 Mode :character
Mean : 51.04 Mean : 34748 Mean : 19285 Mean :2014-08-19 17:22:36 Mean :0.01444
3rd Qu.: 6.00 3rd Qu.: 31359 3rd Qu.: 19238 3rd Qu.:2018-04-02 03:49:06 3rd Qu.:0.00001
Max. :57770.00 Max. :7203370 Max. :1265094 Max. :2020-05-15 18:08:33 Max. :1.00000
# User base
info_user <- user_estadisticas$find()
# Natural log of columns 5:9 as fetched; zero counts become -Inf.
data_log <- as.data.frame(lapply(info_user[, 5:9], log))
# Natural log after replacing zero counts with a small positive value.
# The replacement is limited to columns 5:9, the ones that are logged:
# `info_user[info_user == 0] <- ...` also rewrote every other column that
# compared equal to 0 (e.g. FALSE in the logical `verified`, turning it
# numeric).
info_user[, 5:9][info_user[, 5:9] == 0] <- 0.00001
data_log_1 <- as.data.frame(lapply(info_user[, 5:9], log))
cat("Cantidad de usuarios que han twitteado: ", nrow(info_user))
Cantidad de usuarios que han twitteado: 25435
# ggpairs(data_log)
# ggpairs(info_user[,1:5])
# Columns 5:9 on their original scale.
boxplot(info_user[, 5:9])
# Log scale, computed before the zeros were replaced.
boxplot(data_log)
# Log scale, computed after the zeros were replaced by 0.00001.
boxplot(data_log_1)
#
# info_user$verificado <- ifelse(info_user$verified, "Verificados", "Sin verificar")
# info_user$verificado <- as.factor(info_user$verificado)
Todo: * Juntar usuarios finales, usuarios que fueron replicados * ¿Qué hace que un usuario sea más divulgado? ¿Hay alguna medida de relevancia de un usuario? Aquellos más populares (¿según qué criterio?), ¿de qué tipo son? ¿Instituciones, usuarios comunes, bots? ¿Qué tan activos son? ¿Influye eso? ¿Desde qué dispositivo lo hacen? ¿Qué tipo de texto crean? ¿Qué hashtags usan? ¿De qué regiones son? ¿Hay algo interesante ahí? ¿Hay predominio de algún país? ¿Hay países donde se usa más Twitter?
# Per-user tweet counts by type.
user_tweets_estadisticas <- mongo(collection = "user_tweets_estadisticas", db = "DMUBA")
# User base (note: rebinds `info_user`).
info_user <- user_tweets_estadisticas$find()
# Distribution of the `is_none` counter.
summary(info_user[["is_none"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 1.000 1.000 1.137 1.000 31.000
# Replace zero counts by a small positive value so the log is finite.
# Only columns 3:7 (the ones logged below) are touched;
# `info_user[info_user == 0] <- ...` rewrote every column of the data frame
# that compared equal to 0.
info_user[, 3:7][info_user[, 3:7] == 0] <- 0.00001
data_log_1 <- as.data.frame(lapply(info_user[, 3:7], log))
# Sorted log-counts, one plot per counter, to eyeball groups of users.
plot(sort(data_log_1$is_rt))
plot(sort(data_log_1$is_only_rt))
plot(sort(data_log_1$is_only_qt))
plot(sort(data_log_1$is_none))
plot(sort(data_log_1$is_qt))
TODO: Binning con esto? Alinear distintos grupos en cada categoria? Solo clasificarlos?
# First rows after sorting users by `count`, largest first.
head(info_user[order(info_user$count, decreasing = TRUE), ])
Curiosamente, los usuarios finales con más tweets son creadores. ¿Serán bots?
# A user is a "Creador" when `is_none` exceeds `is_only_rt + is_only_qt`,
# otherwise a "Difusor". The sum used to add `is_only_rt` to itself, which
# ignored `is_only_qt` and double-counted `is_only_rt`.
info_user$tipo <- ifelse(
  info_user$is_none > info_user$is_only_rt + info_user$is_only_qt,
  "Creador",
  "Difusor"
)
barplot(table(info_user$tipo))

# Users per category, as a data frame for ggplot.
info_user_graf <- data.frame(table(info_user$tipo))
names(info_user_graf) <- c("Tipo_usuario", "Cantidad")

# Horizontal bar chart written to "tipo_usuario_creacion.png".
png(filename = "tipo_usuario_creacion.png", width = 1000, bg = "white")
ggplot(info_user_graf, aes(x = reorder(Tipo_usuario, Cantidad), y = Cantidad, fill = Tipo_usuario)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "",
    subtitle = "",
    caption = "",
    tag = ""
  ) +
  xlab("") +
  ylab("") +
  # The dangling comma that left an empty last argument in theme() is removed.
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 14),
    axis.text.y = element_text(margin = margin(10, 10, 10, 10)),
    axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
    legend.text = element_text(size = 14)
  ) +
  coord_flip()
dev.off()
png
2
# Reload the per-user statistics.
user_estadisticas <- mongo(db = "DMUBA", collection = "user_estadisticas")
info_user <- user_estadisticas$find()
# log10 of columns 5:9 as fetched; zero counts become -Inf.
data_log <- as.data.frame(lapply(info_user[, 5:9], log10))
# Replace zeros and missing values so the log is finite. Only columns 5:9 are
# touched; the previous whole-frame replacement also rewrote every other
# column that contained a 0/FALSE or an NA.
# NOTE(review): zeros become 0.00001 but NAs become 0.0001; both values are
# kept as they were. Confirm the difference is intended.
info_user[, 5:9][info_user[, 5:9] == 0] <- 0.00001
info_user[, 5:9][is.na(info_user[, 5:9])] <- 0.0001
data_log_1 <- as.data.frame(lapply(info_user[, 5:9], log10))
Correlations
# Pairwise plot matrix of `data_log_1` (the log10-transformed columns 5:9 of
# `info_user`).
ggpairs(data_log_1)
There were 25 warnings (use warnings() to see them)
# Smooths a numeric vector by bin means and plots the result.
#
# `valores` is discretised into `n_bins` bins with discretize(); every value
# is then replaced by the mean of its bin, and the sorted original values are
# plotted against the sorted smoothed ones.
#
# valores:  numeric vector to smooth.
# metodo:   method passed to discretize() ("equalfreq" or "equalwidth").
# n_bins:   number of bins.
# etiqueta: variable name; used as column name and as y-axis label.
#
# Returns the data frame given by discretize() (bin index in column `X`)
# with two extra columns: the original values (named `etiqueta`) and
# `suavizado`.
suavizar_por_bins <- function(valores, metodo, n_bins, etiqueta) {
  bins <- discretize(valores, metodo, n_bins)
  bins[[etiqueta]] <- valores
  # Mean of each bin, broadcast back to every member of the bin.
  bins$suavizado <- ave(valores, bins$X, FUN = mean)

  # Original values, sorted in increasing order.
  plot(
    sort(valores),
    type = "p", col = "red",
    ylab = etiqueta, xlab = "Observaciones",
    main = "Dato original vs suavizado"
  )
  # Smoothed series on top.
  points(sort(bins$suavizado), col = "blue")
  # Both series are drawn as points, so the legend shows point symbols.
  legend(
    "topleft",
    legend = c("Original", "Suavizado"),
    col = c("red", "blue"), pch = 1
  )
  bins
}

# The same smoothing applied to four counters. `bin_eq_freq` is overwritten
# each time, so it ends up holding the result for `favourites_count`.
bin_eq_freq <- suavizar_por_bins(data_log_1$followers_count, "equalfreq", 20, "followers_count")
bin_eq_freq <- suavizar_por_bins(data_log_1$listed_count, "equalfreq", 20, "listed_count")
# no_na_data <- data_log_1[!is.na(data_log_1$statuses_count),]
bin_eq_freq <- suavizar_por_bins(data_log_1$statuses_count, "equalwidth", 5, "statuses_count")
# no_na_data <- data_log_1[!is.na(data_log_1$favourites_count),]
bin_eq_freq <- suavizar_por_bins(data_log_1$favourites_count, "equalwidth", 10, "favourites_count")
TODO: * Dentro de los creadores, ¿alguno fue retwitteado? ¿Citado? ¿Cuál es el impacto de los creadores? * Dentro de los difusores, ¿qué impacto tienen? ¿Qué relevancia tienen los creadores originales? ¿Cuántos tweets fueron amplificados más de una vez en el grupo de twitteros finales? * ¿Es muy simplista esto? ¿Funciona? ¿Hay dispositivos privilegiados? ¿Usan software para publicaciones los creadores? ¿Los difusores? * Entre los creadores, ¿hay verificados? ¿Hay alguna forma de evaluar la confiabilidad o la veracidad de lo que dicen? * Entre los difusores, ¿hay fake news? ¿Hay difusión indiscriminada? ¿Hay relación entre algún par de usuarios? ¿Hay alguna persona que tiene más difusión que otra?
# Project only the location-related fields of every tweet
# (note: this rebinds `tweets`).
tweets <- mongo(collection = "tweets_lower", db = "DMUBA")
df_location <- tweets$aggregate('[{
"$project": {
"_id": "$_id",
"location": "$location",
"retweet_location": "$retweet_location",
"quoted_location": "$quoted_location",
"country_code": "$country_code",
"country": "$country",
"lat": "$lat",
"lng": "$lng"
}
}
]')

# Columns to summarise and the label shown for each of them.
columnas_location <- c(
  "location", "retweet_location", "quoted_location",
  "country_code", "country", "lat", "lng"
)
nombre_location <- c("location", "retweet", "quoted", "country_code", "country", "lat", "lng")

# Distinct values and missing values per column.
cant_unique <- vapply(
  columnas_location,
  function(columna) length(unique(df_location[[columna]])),
  integer(1),
  USE.NAMES = FALSE
)
cant_na <- vapply(
  columnas_location,
  function(columna) sum(is.na(df_location[[columna]])),
  integer(1),
  USE.NAMES = FALSE
)

# One row per attribute. Building the frame directly replaces
# do.call(rbind, Map(data.frame, ...)), which bound seven one-row frames
# together.
df <- data.frame(
  Atributo = nombre_location,
  Unique = cant_unique,
  Na = cant_na,
  row.names = nombre_location,
  stringsAsFactors = FALSE
)
# Share of tweets with a missing value, in percent.
df$porcentaje_na <- df$Na / nrow(df_location) * 100
There were 22 warnings (use warnings() to see them)
# png(filename="location_porc_na.png", width=1000, bg="white")
# Percentage of missing values per location attribute, as horizontal bars
# ordered by that percentage.
ggplot(df, aes(x = reorder(Atributo, porcentaje_na), y = porcentaje_na, fill = Atributo)) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  # All titles and axis labels are intentionally blank.
  labs(title = "", subtitle = "", caption = "", tag = "", x = "", y = "") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 14),
    axis.text.y = element_text(margin = margin(10, 10, 10, 10)),
    axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
    legend.text = element_text(size = 14),
    # Square panel.
    aspect.ratio = 1
  )
# dev.off()
# Directory holding the country catalogues. It defaults to the original
# absolute path and can be overridden with the TP1_DATA_DIR environment
# variable, so the script also runs on other machines.
dir_tp1 <- Sys.getenv(
  "TP1_DATA_DIR",
  unset = "C:\\Users\\Lucas\\Desktop\\2019\\Data minning\\DataMiningUba2020\\Tps\\Tp1"
)
# Country catalogues in English and in Spanish, both ';'-separated.
paises_en <- read.csv(file.path(dir_tp1, "countries.en.csv"), header = TRUE, sep = ";")
paises_es <- read.csv(file.path(dir_tp1, "countries.es.csv"), header = TRUE, sep = ";")
head(paises_en)
head(paises_es)
# Strip leading and trailing whitespace from the Spanish catalogue's text
# columns.
for (columna in c("Pais", "Codigo", "Region", "Continente")) {
  paises_es[[columna]] <- trimws(paises_es[[columna]], which = "both")
}
df_location$country_2 <- df_location$country
df_location$country_code_2 <- df_location$country_code
for(i in paises_en$NAME){
print(i)
df_location$country_2 <- ifelse(grepl(tolower(i), tolower(df_location$location), fixed= T), tolower(i), df_location$country_2)
}
for(i in paises_en$NAME){
print(i)
if (nrow(df_location[!is.na(df_location$country_2) & df_location$country_2 == tolower(i),]) > 0) {
# df_location$country_code_2[!is.na(df_location$country_2) & df_location$country_2 == tolower('spain')] <- as.character.factor(paises_en$ISO[paises_en$NAME == 'SPAIN'])
# df_location$country_code_2[!is.na(df_location$country_2) & df_location$country_2 == tolower(i)] <- as.character.factor(paises_en$ISO[paises_en$NAME == i])
df_location[!is.na(df_location$country_2) & df_location$country_2 == tolower(i),]$country_code_2 <- paises_en$ISO[paises_en$NAME == i]
}
}
for(i in paises_es$Pais){
print(i)
# ifelse(grepl("Argentina", df_location$location, fixed= T), "Argentina", "")
df_location$country_2 <- ifelse(grepl(tolower(i), tolower(df_location$location), fixed= T), tolower(i), df_location$country_2)
}
for(i in paises_es$Pais){
print(i)
if (nrow(df_location[!is.na(df_location$country_2) & df_location$country_2 == tolower(i),]) > 0) {
# df_location$country_code_2[!is.na(df_location$country_2) & df_location$country_2 == tolower('spain')] <- as.character.factor(paises_en$ISO[paises_en$NAME == 'SPAIN'])
# df_location$country_code_2[!is.na(df_location$country_2) & df_location$country_2 == tolower(i)] <- as.character.factor(paises_en$ISO[paises_en$NAME == i])
df_location[!is.na(df_location$country_2) & df_location$country_2 == tolower(i),]$country_code_2 <- paises_es$Codigo[paises_es$Pais == i]
}
# df_location$country_code_2[!is.na(df_location$country_2) & df_location$country_2 == tolower(i)] <- as.character.factor(paises_es$Codigo[paises_es$Pais == i])
}
# Attach the region and continent of each row's country code, looked up in
# paises_es. Rows whose code is NA or not listed in paises_es are left as is.
pais_idx <- match(
  df_location$country_code_2,
  paises_es$Codigo,
  incomparables = NA
)
con_pais <- !is.na(pais_idx)

for (columna in c("Region", "Continente")) {
  # Create the target column on first use so the partial assignment below
  # always has a full-length vector to write into.
  if (!columna %in% names(df_location)) {
    df_location[[columna]] <- rep(NA_character_, nrow(df_location))
  }
  df_location[[columna]][con_pais] <-
    as.character(paises_es[[columna]][pais_idx[con_pais]])
}
# Classify tweet authors, and then retweeted users, as "Medio", "Politica" or
# "Normal" according to keywords found in their profile description.
t <- mongo(collection = "tweets_lower", db = "DMUBA")

palabras_noticias <- c(
  "noticia", "periodismo", "periodista", "periodico", "news", "journalist",
  "reportero", "programa de tv", "television", "Reuters ", "elpaisamerica",
  "productora", "conductor", "columnista", "corresponsal", "telesur"
)
palabras_politica <- c(
  "politico", "senador", "diputado", "alcalde", "subsecretario", "secretario",
  "secretaria", "presidencia", "presidente", "ministerio", "ministro",
  "ministra", "público", "publico", "canciller", "Partido Socialista", "PSUV",
  "partido del pueblo", "asamblea nacional"
)

# Normalise a vector of profile descriptions for keyword matching.
#
# Lower-cases the text, removes URLs, user mentions, punctuation and tokens
# containing digits, drops any character that is not alphanumeric or blank, and
# finally transliterates to ASCII. Returns a character vector of the same
# length as `text`.
clean_description <- function(text) {
  text <- tolower(text)
  # Remove only the URL token; the text that follows a link is kept so that
  # keywords appearing after it can still be found.
  text <- gsub("http\\S*", "", text)
  text <- gsub("@\\w+", "", text)
  text <- gsub("[[:punct:]]", "", text)
  text <- gsub("\\w*[0-9]+\\w*\\s*", "", text)
  text <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", text)
  iconv(text, from = "UTF-8", to = "ASCII//TRANSLIT")
}

# TRUE for every element of `text` that contains at least one of `keywords` as
# a literal substring. Keywords are lower-cased first because the cleaned text
# is lower case; NA text never matches.
matches_any_keyword <- function(text, keywords) {
  matched <- rep(FALSE, length(text))
  for (keyword in tolower(keywords)) {
    matched <- matched | grepl(keyword, text, fixed = TRUE)
  }
  matched
}

# Replace the `text` column of `users` with a `tipo_user` column.
#
# users              data frame with a `text` column holding the description.
# news_keywords      keywords that mark a user as "Medio".
# politics_keywords  keywords that mark a user as "Politica".
#
# A user matching both keyword lists is labelled "Politica"; a user matching
# neither is labelled "Normal". Returns the modified data frame.
classify_user_type <- function(users,
                               news_keywords = palabras_noticias,
                               politics_keywords = palabras_politica) {
  description <- clean_description(users$text)
  is_news_related <- matches_any_keyword(description, news_keywords)
  is_politic_related <- matches_any_keyword(description, politics_keywords)

  users$tipo_user <- rep("Normal", nrow(users))
  # Index the column, not the rows, so an all-FALSE mask is a no-op rather
  # than an error.
  users$tipo_user[is_news_related] <- "Medio"
  users$tipo_user[is_politic_related] <- "Politica"
  users$text <- NULL
  users
}

# Tweet authors ----
aux <- t$aggregate('[{"$project":{"_id": "$_id","user_id":"$user_id","screen_name":"$screen_name","text":"$description"}}]')
aux <- classify_user_type(aux)
aux[aux$tipo_user == "Politica", ]
# NOTE(review): max() on strings is alphabetical, so a screen name with mixed
# labels resolves as Politica > Normal > Medio -- confirm this is intended.
aux %>% group_by(screen_name) %>% summarise(tipo = max(tipo_user))

# Retweeted users ----
aux <- t$aggregate('[{"$project":{"_id":"$_id","user_id":"$retweet_user_id","screen_name":"$retweet_screen_name","text":"$retweet_description"}}]')
# Rows without a screen name are dropped before classifying.
aux <- aux[!is.na(aux$screen_name), ]
aux <- classify_user_type(aux)
aux[aux$tipo_user == "Politica", ]
# NOTE(review): same alphabetical max() ordering as above.
aux %>% group_by(screen_name) %>% summarise(tipo = max(tipo_user))
# Column names of `user` and `aux`; the console output recorded for each call
# is kept below it as a comment so the script still parses.
names(user)
# [1] "_id" "user_id" "retweet_screen_name" "tipo_user"
names(aux)
# [1] "_id" "user_id" "screen_name" "tipo_user"